Write a short description about the course and add a link to your GitHub repository here. This is an R Markdown (.Rmd) file so you should use R Markdown syntax.
Testi
# This is a so-called "R chunk" where you can write R code.
date()
## [1] "Mon Dec 07 10:42:20 2020"
The text continues here.
Describe the work you have done this week and summarize your learning.
lrn2014 <- read.csv("C:/Users/Juhana/Documents/IODS-project-1/data/learning2014.csv", row.names = 1)
alcGit <- read.csv("https://github.com/JunzQ7/IODS-project/tree/master/data/alc.csv")
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
p <- ggpairs(lrn2014, mapping = aes(), lower = list(combo = wrap("facethist", bins = 20)))
p
model = lm(Points~Attitude+stra+surf, data = lrn2014)
summary(model)
##
## Call:
## lm(formula = Points ~ Attitude + stra + surf, data = lrn2014)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.1550 -3.4346 0.5156 3.6401 10.8952
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.01711 3.68375 2.991 0.00322 **
## Attitude 0.33952 0.05741 5.913 1.93e-08 ***
## stra 0.85313 0.54159 1.575 0.11716
## surf -0.58607 0.80138 -0.731 0.46563
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.296 on 162 degrees of freedom
## Multiple R-squared: 0.2074, Adjusted R-squared: 0.1927
## F-statistic: 14.13 on 3 and 162 DF, p-value: 3.156e-08
model2 = lm(Points~Attitude, data = lrn2014)
summary(model2)
##
## Call:
## lm(formula = Points ~ Attitude, data = lrn2014)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.9763 -3.2119 0.4339 4.1534 10.6645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.63715 1.83035 6.358 1.95e-09 ***
## Attitude 0.35255 0.05674 6.214 4.12e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.32 on 164 degrees of freedom
## Multiple R-squared: 0.1906, Adjusted R-squared: 0.1856
## F-statistic: 38.61 on 1 and 164 DF, p-value: 4.119e-09
plot(model2, which = c(1,2,5))
alc <- read.csv("data/alc.csv", row.names = 1)
Selected variables: sex, absences, internet, activities
library(ggplot2)
g1 <- ggplot(alc, aes(x = high_use, y = absences))
g1 + geom_boxplot()
(prop.table(table(alc$sex, alc$high_use), margin = 1))
##
## FALSE TRUE
## F 0.7878788 0.2121212
## M 0.6086957 0.3913043
m <- glm(high_use ~ absences + sex + internet + activities, data = alc, family = "binomial")
summary(m)
##
## Call:
## glm(formula = high_use ~ absences + sex + internet + activities,
## family = "binomial", data = alc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3480 -0.8517 -0.6163 1.1187 2.0931
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.68510 0.35648 -4.727 2.28e-06 ***
## absences 0.09672 0.02315 4.178 2.95e-05 ***
## sexM 1.02724 0.24411 4.208 2.57e-05 ***
## internetyes 0.02369 0.33998 0.070 0.944
## activitiesyes -0.38679 0.23891 -1.619 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 465.68 on 381 degrees of freedom
## Residual deviance: 427.43 on 377 degrees of freedom
## AIC: 437.43
##
## Number of Fisher Scoring iterations: 4
coef(m)
## (Intercept) absences sexM internetyes activitiesyes
## -1.68509737 0.09672074 1.02724315 0.02369222 -0.38678774
probabilities <- predict(m, type = "response")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# add the predicted probabilities to 'alc'
alc <- mutate(alc, probability = probabilities)
# use the probabilities to make a prediction of high_use
alc <- mutate(alc, prediction = probability > 0.5)
# tabulate the target variable versus the predictions
table(high_use = alc$high_use, prediction = alc$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 257 11
## TRUE 88 26
# access dplyr and ggplot2
library(dplyr); library(ggplot2)
# initialize a plot of 'high_use' versus 'probability' in 'alc'
g <- ggplot(alc, aes(x = probability, y = high_use, col = prediction))
# define the geom as points and draw the plot
g + geom_point()
# tabulate the target variable versus the predictions
table(high_use = alc$high_use, prediction = alc$prediction) %>% prop.table %>% addmargins
## prediction
## high_use FALSE TRUE Sum
## FALSE 0.67277487 0.02879581 0.70157068
## TRUE 0.23036649 0.06806283 0.29842932
## Sum 0.90314136 0.09685864 1.00000000
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
data(Boston)
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
boston_scaled = scale(Boston)
str(boston_scaled)
## num [1:506, 1:14] -0.419 -0.417 -0.417 -0.416 -0.412 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:506] "1" "2" "3" "4" ...
## ..$ : chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:center")= Named num [1:14] 3.6135 11.3636 11.1368 0.0692 0.5547 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:scale")= Named num [1:14] 8.602 23.322 6.86 0.254 0.116 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
boston_scaled <- as.data.frame(boston_scaled)
bins <- quantile(boston_scaled$crim)
bins
## 0% 25% 50% 75% 100%
## -0.419366929 -0.410563278 -0.390280295 0.007389247 9.924109610
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)
n <- nrow(boston_scaled)
# choose randomly 80% of the rows
ind <- sample(n, size = n * 0.8)
# create train set
train <- boston_scaled[ind,]
# create test set
test <- boston_scaled[-ind,]
# save the correct classes from test data
correct_classes <- test$crime
# remove the crime variable from test data
test <- dplyr::select(test, -crime)
# LDA
lda.fit <- lda(crime ~ ., data = train)
# print the lda.fit object
lda.fit
## Call:
## lda(crime ~ ., data = train)
##
## Prior probabilities of groups:
## low med_low med_high high
## 0.2648515 0.2450495 0.2400990 0.2500000
##
## Group means:
## zn indus chas nox rm age
## low 0.99963472 -0.9502186 -0.12514775 -0.8971329 0.3980328 -0.9581286
## med_low -0.04980677 -0.2740253 -0.03371693 -0.5719032 -0.1874349 -0.3762519
## med_high -0.39794966 0.2665885 0.25532354 0.4168615 0.1484944 0.4327635
## high -0.48724019 1.0171306 -0.07742312 1.0599090 -0.4152660 0.8262068
## dis rad tax ptratio black lstat
## low 0.9300311 -0.6802644 -0.7266141 -0.38352002 0.3787572 -0.77653809
## med_low 0.4359209 -0.5445257 -0.4894452 -0.08117323 0.3052924 -0.08644452
## med_high -0.4014002 -0.3780380 -0.2654745 -0.31755642 0.0557307 0.00925801
## high -0.8620530 1.6379981 1.5139626 0.78062517 -0.8224625 0.90069064
## medv
## low 0.48510772
## med_low -0.02992579
## med_high 0.23956195
## high -0.69976120
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## zn 0.09778179 0.644664607 -0.89990008
## indus 0.03380819 -0.429698464 0.47304154
## chas -0.08559858 -0.046481431 0.04002871
## nox 0.40676788 -0.633348612 -1.32847500
## rm -0.09102589 -0.127207631 -0.22609171
## age 0.31226677 -0.330683710 -0.11391794
## dis -0.04348803 -0.289299715 0.33358859
## rad 2.96339347 0.961083740 0.11065576
## tax 0.02786398 -0.002940165 0.31086437
## ptratio 0.08899162 0.062808218 -0.30639516
## black -0.15292134 0.037540808 0.12885768
## lstat 0.26378306 -0.217671883 0.43734284
## medv 0.22638287 -0.436918873 -0.12973229
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.9407 0.0447 0.0146
# the function for lda biplot arrows
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "orange", tex = 0.75, choices = c(1,2)){
heads <- coef(x)
arrows(x0 = 0, y0 = 0,
x1 = myscale * heads[,choices[1]],
y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
text(myscale * heads[,choices], labels = row.names(heads),
cex = tex, col=color, pos=3)
}
# target classes as numeric
classes <- as.numeric(train$crime)
# plot the lda results
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)
# predict classes with test data
lda.pred <- predict(lda.fit, newdata = test)
# cross tabulate the results
table(correct = correct_classes, predicted = lda.pred$class)
## predicted
## correct low med_low med_high high
## low 12 5 3 0
## med_low 3 20 4 0
## med_high 0 16 13 0
## high 0 0 0 26
# load MASS and Boston
library(MASS)
library(ggplot2)
data('Boston')
Boston <- scale(Boston)
# euclidean distance matrix
dist_eu <- dist(Boston)
# look at the summary of the distances
summary(dist_eu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.4625 4.8241 4.9111 6.1863 14.3970
# manhattan distance matrix
dist_man <- dist(Boston, method = 'manhattan')
# look at the summary of the distances
summary(dist_man)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2662 8.4832 12.6090 13.5488 17.7568 48.8618
# k-means clustering
km <-kmeans(Boston, centers = 3)
# plot the Boston dataset with clusters
pairs(Boston, col = km$cluster)
set.seed(123)
# determine the number of clusters
k_max <- 10
# calculate the total within sum of squares
twcss <- sapply(1:k_max, function(k){kmeans(Boston, k)$tot.withinss})
# visualize the results
qplot(x = 1:k_max, y = twcss, geom = 'line')
data(Boston)
Boston <- as.data.frame(scale(Boston))
km <- kmeans(Boston, centers = 5)
Boston$km <- km$cluster
lda.fit <- lda(km~., Boston)
classes <- as.numeric(Boston$km)
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)
library(MASS)
data(Boston)
boston_scaled = scale(Boston)
boston_scaled <- as.data.frame(boston_scaled)
bins <- quantile(boston_scaled$crim)
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)
n <- nrow(boston_scaled)
ind <- sample(n, size = n * 0.8)
train <- boston_scaled[ind,]
test <- boston_scaled[-ind,]
correct_classes <- test$crime
test <- dplyr::select(test, -crime)
# LDA
lda.fit <- lda(crime ~ ., data = train)
model_predictors <- dplyr::select(train, -crime)
# check the dimensions
dim(model_predictors)
## [1] 404 13
dim(lda.fit$scaling)
## [1] 13 3
# matrix multiplication
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = as.numeric(train$crime))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
library(MASS)
library(dplyr)
library(stringr)
human <- read.table("http://s3.amazonaws.com/assets.datacamp.com/production/course_2218/datasets/human1.txt", sep =",", header = T)
human$GNI <- str_replace(human$GNI, pattern=",", replace ="") %>% as.numeric
keep <- c("Country", "Edu2.FM", "Labo.FM", "Life.Exp", "Edu.Exp", "GNI", "Mat.Mor", "Ado.Birth", "Parli.F")
# select the 'keep' columns
human <- dplyr::select(human, one_of(keep))
# print out a completeness indicator of the 'human' data
human <- human[complete.cases(human),]
last <- nrow(human) - 7
# choose everything until the last 7 observations
human <- human[1:last, ]
# add countries as rownames
rownames(human) <- human$Country
human <- dplyr::select(human, -Country)
library(ggplot2)
library(GGally)
library(corrplot)
## corrplot 0.84 loaded
ggpairs(human)
cor(human) %>% corrplot
human_std <- scale(human)
# print out summaries of the standardized variables
summary(human_std)
## Edu2.FM Labo.FM Life.Exp Edu.Exp
## Min. :-2.8189 Min. :-2.6247 Min. :-2.7188 Min. :-2.7378
## 1st Qu.:-0.5233 1st Qu.:-0.5484 1st Qu.:-0.6425 1st Qu.:-0.6782
## Median : 0.3503 Median : 0.2316 Median : 0.3056 Median : 0.1140
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5958 3rd Qu.: 0.7350 3rd Qu.: 0.6717 3rd Qu.: 0.7126
## Max. : 2.6646 Max. : 1.6632 Max. : 1.4218 Max. : 2.4730
## GNI Mat.Mor Ado.Birth Parli.F
## Min. :-0.9193 Min. :-0.6992 Min. :-1.1325 Min. :-1.8203
## 1st Qu.:-0.7243 1st Qu.:-0.6496 1st Qu.:-0.8394 1st Qu.:-0.7409
## Median :-0.3013 Median :-0.4726 Median :-0.3298 Median :-0.1403
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.3712 3rd Qu.: 0.1932 3rd Qu.: 0.6030 3rd Qu.: 0.6127
## Max. : 5.6890 Max. : 4.4899 Max. : 3.8344 Max. : 3.1850
pca_human <- prcomp(human)
# perform principal component analysis (with the SVD method)
pca_human_std <- prcomp(human_std)
# draw a biplot of the principal component representation and the original variables
biplot(pca_human, choices = 1:2, cex = c(0.8, 1), col = c("grey40", "deeppink2"))
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
biplot(pca_human_std, choices = 1:2, cex = c(0.8, 1), col = c("grey40", "deeppink2"))
pca_human_std
## Standard deviations (1, .., p=8):
## [1] 2.0708380 1.1397204 0.8750485 0.7788630 0.6619563 0.5363061 0.4589994
## [8] 0.3222406
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4 PC5
## Edu2.FM -0.35664370 0.03796058 -0.24223089 0.62678110 -0.5983585
## Labo.FM 0.05457785 0.72432726 -0.58428770 0.06199424 0.2625067
## Life.Exp -0.44372240 -0.02530473 0.10991305 -0.05834819 0.1628935
## Edu.Exp -0.42766720 0.13940571 -0.07340270 -0.07020294 0.1659678
## GNI -0.35048295 0.05060876 -0.20168779 -0.72727675 -0.4950306
## Mat.Mor 0.43697098 0.14508727 -0.12522539 -0.25170614 -0.1800657
## Ado.Birth 0.41126010 0.07708468 0.01968243 0.04986763 -0.4672068
## Parli.F -0.08438558 0.65136866 0.72506309 0.01396293 -0.1523699
## PC6 PC7 PC8
## Edu2.FM 0.17713316 0.05773644 0.16459453
## Labo.FM -0.03500707 -0.22729927 -0.07304568
## Life.Exp -0.42242796 -0.43406432 0.62737008
## Edu.Exp -0.38606919 0.77962966 -0.05415984
## GNI 0.11120305 -0.13711838 -0.16961173
## Mat.Mor 0.17370039 0.35380306 0.72193946
## Ado.Birth -0.76056557 -0.06897064 -0.14335186
## Parli.F 0.13749772 0.00568387 -0.02306476
library(FactoMineR)
library(tidyr)
data(tea)
keep_columns <- c("Tea", "How", "how", "sugar", "where", "lunch")
# select the 'keep_columns' to create a new dataset
tea_time <- dplyr::select(tea, one_of(keep_columns))
# look at the summaries and structure of the data
summary(tea_time)
## Tea How how sugar
## black : 74 alone:195 tea bag :170 No.sugar:155
## Earl Grey:193 lemon: 33 tea bag+unpackaged: 94 sugar :145
## green : 33 milk : 63 unpackaged : 36
## other: 9
## where lunch
## chain store :192 lunch : 44
## chain store+tea shop: 78 Not.lunch:256
## tea shop : 30
##
str(tea_time)
## 'data.frame': 300 obs. of 6 variables:
## $ Tea : Factor w/ 3 levels "black","Earl Grey",..: 1 1 2 2 2 2 2 1 2 1 ...
## $ How : Factor w/ 4 levels "alone","lemon",..: 1 3 1 1 1 1 1 3 3 1 ...
## $ how : Factor w/ 3 levels "tea bag","tea bag+unpackaged",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ sugar: Factor w/ 2 levels "No.sugar","sugar": 2 1 1 2 1 1 1 1 1 1 ...
## $ where: Factor w/ 3 levels "chain store",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ lunch: Factor w/ 2 levels "lunch","Not.lunch": 2 2 2 2 2 2 2 2 2 2 ...
# visualize the dataset
gather(tea_time) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + geom_bar() + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
## Warning: attributes are not identical across measure variables;
## they will be dropped
mca <- MCA(tea_time, graph = F)
# summary of the model
summary(mca)
##
## Call:
## MCA(X = tea_time, graph = F)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 0.279 0.261 0.219 0.189 0.177 0.156 0.144
## % of var. 15.238 14.232 11.964 10.333 9.667 8.519 7.841
## Cumulative % of var. 15.238 29.471 41.435 51.768 61.434 69.953 77.794
## Dim.8 Dim.9 Dim.10 Dim.11
## Variance 0.141 0.117 0.087 0.062
## % of var. 7.705 6.392 4.724 3.385
## Cumulative % of var. 85.500 91.891 96.615 100.000
##
## Individuals (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 1 | -0.298 0.106 0.086 | -0.328 0.137 0.105 | -0.327
## 2 | -0.237 0.067 0.036 | -0.136 0.024 0.012 | -0.695
## 3 | -0.369 0.162 0.231 | -0.300 0.115 0.153 | -0.202
## 4 | -0.530 0.335 0.460 | -0.318 0.129 0.166 | 0.211
## 5 | -0.369 0.162 0.231 | -0.300 0.115 0.153 | -0.202
## 6 | -0.369 0.162 0.231 | -0.300 0.115 0.153 | -0.202
## 7 | -0.369 0.162 0.231 | -0.300 0.115 0.153 | -0.202
## 8 | -0.237 0.067 0.036 | -0.136 0.024 0.012 | -0.695
## 9 | 0.143 0.024 0.012 | 0.871 0.969 0.435 | -0.067
## 10 | 0.476 0.271 0.140 | 0.687 0.604 0.291 | -0.650
## ctr cos2
## 1 0.163 0.104 |
## 2 0.735 0.314 |
## 3 0.062 0.069 |
## 4 0.068 0.073 |
## 5 0.062 0.069 |
## 6 0.062 0.069 |
## 7 0.062 0.069 |
## 8 0.735 0.314 |
## 9 0.007 0.003 |
## 10 0.643 0.261 |
##
## Categories (the 10 first)
## Dim.1 ctr cos2 v.test Dim.2 ctr cos2
## black | 0.473 3.288 0.073 4.677 | 0.094 0.139 0.003
## Earl Grey | -0.264 2.680 0.126 -6.137 | 0.123 0.626 0.027
## green | 0.486 1.547 0.029 2.952 | -0.933 6.111 0.107
## alone | -0.018 0.012 0.001 -0.418 | -0.262 2.841 0.127
## lemon | 0.669 2.938 0.055 4.068 | 0.531 1.979 0.035
## milk | -0.337 1.420 0.030 -3.002 | 0.272 0.990 0.020
## other | 0.288 0.148 0.003 0.876 | 1.820 6.347 0.102
## tea bag | -0.608 12.499 0.483 -12.023 | -0.351 4.459 0.161
## tea bag+unpackaged | 0.350 2.289 0.056 4.088 | 1.024 20.968 0.478
## unpackaged | 1.958 27.432 0.523 12.499 | -1.015 7.898 0.141
## v.test Dim.3 ctr cos2 v.test
## black 0.929 | -1.081 21.888 0.382 -10.692 |
## Earl Grey 2.867 | 0.433 9.160 0.338 10.053 |
## green -5.669 | -0.108 0.098 0.001 -0.659 |
## alone -6.164 | -0.113 0.627 0.024 -2.655 |
## lemon 3.226 | 1.329 14.771 0.218 8.081 |
## milk 2.422 | 0.013 0.003 0.000 0.116 |
## other 5.534 | -2.524 14.526 0.197 -7.676 |
## tea bag -6.941 | -0.065 0.183 0.006 -1.287 |
## tea bag+unpackaged 11.956 | 0.019 0.009 0.000 0.226 |
## unpackaged -6.482 | 0.257 0.602 0.009 1.640 |
##
## Categorical variables (eta2)
## Dim.1 Dim.2 Dim.3
## Tea | 0.126 0.108 0.410 |
## How | 0.076 0.190 0.394 |
## how | 0.708 0.522 0.010 |
## sugar | 0.065 0.001 0.336 |
## where | 0.702 0.681 0.055 |
## lunch | 0.000 0.064 0.111 |
# visualize MCA
plot(mca, invisible=c("ind"), habillage = "quali", xlim = c(-2,2), cex = .8)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
Visit http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/114-mca-multiple-correspondence-analysis-in-r-essentials/ for more MCA visualizations!